# Importing SPC class
from SPyChart import SPC
# Pandas and numpy to create example input time series datasets
import pandas as pd
import numpy as np
"""
Artificial daily data made of three consecutive process regimes (i.e. two change points),
each sampled from a Normal distribution with its own mean and standard deviation.
"""
# Empty dataframes, one per regime
df_1 = pd.DataFrame()
df_2 = pd.DataFrame()
df_3 = pd.DataFrame()
# Create each dataframe sampling from normal dist. with different params.
# 'D' is the documented pandas alias for calendar-day frequency; the lowercase
# spelling is not the canonical form and is flagged as deprecated by newer pandas.
df_1['ds'] = pd.date_range(start='2020-01-01', end='2020-03-01', freq='D')
df_1['TARGET'] = np.random.normal(loc=50, scale=10, size=len(df_1))
df_2['ds'] = pd.date_range(start='2020-03-02', end='2020-06-01', freq='D')
df_2['TARGET'] = np.random.normal(loc=40, scale=6, size=len(df_2))
df_3['ds'] = pd.date_range(start='2020-06-02', end='2020-09-01', freq='D')
df_3['TARGET'] = np.random.normal(loc=60, scale=7, size=len(df_3))
# Combine all datasets into one, with clear process changes at 2020-03-02 and 2020-06-02.
df = pd.concat([df_1, df_2, df_3])
# Use the date column as the index (the SPC examples below are fed a date-indexed frame).
df = df.set_index('ds', drop=True)
# Show first five rows.
df.head()
| | TARGET |
|---|---|
| ds | |
| 2020-01-01 | 61.273469 |
| 2020-01-02 | 44.770683 |
| 2020-01-03 | 39.007239 |
| 2020-01-04 | 52.765780 |
| 2020-01-05 | 54.500268 |
In this example, we will create a simple SPC chart. For this example, we will ignore any process changes and plot the SPC chart with only a single calculation for the control lines (i.e., the control lines will take all data into account).
# Build an XmR chart over the whole series (one set of control lines for all the data).
spc_example = SPC(
    data_in=df,              # input dataset created above
    target_col='TARGET',     # column of df to analyse
    chart_type='XmR-chart',  # individual chart (top) plus moving-range chart (bottom)
)

# Step 1: calculate the control lines; the results are stored on the SPC object.
spc_example.setup()

# Step 2: test the SPC rules against the control lines calculated in step 1.
spc_example.check_rules()

# Step 3: draw the chart with Plotly (interactive & downloadable plot).
spc_example.plot_spc(title='Example SPC (XmR-chart) - Basic usage')
# If we want the data to build our own SPC charts, use the spc_data attribute.
# For the XmR chart built above it unpacks into 2 dataframes:
# - spc_data_X: data for the individual chart (top chart)
# - spc_data_Y: data for the mR chart (bottom chart)
spc_data_X, spc_data_Y = spc_example.spc_data
# Preview the first rows of the individual-chart data.
spc_data_X.head()
| | ds | TARGET | cl | lcl | ucl | +1sd | -1sd | +2sd | -2sd | Rule 1 violation | Rule 2 violation | Rule 3 violation | Rule 4 violation | Rule 5 violation | chart type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-01 | 61.273469 | 50.600264 | 28.892831 | 72.307696 | 57.836075 | 43.364453 | 65.071886 | 36.128642 | 0 | 0 | 0 | 0 | 0 | X-chart |
| 1 | 2020-01-02 | 44.770683 | 50.600264 | 28.892831 | 72.307696 | 57.836075 | 43.364453 | 65.071886 | 36.128642 | 0 | 0 | 0 | 0 | 0 | X-chart |
| 2 | 2020-01-03 | 39.007239 | 50.600264 | 28.892831 | 72.307696 | 57.836075 | 43.364453 | 65.071886 | 36.128642 | 0 | 0 | 0 | 0 | 0 | X-chart |
| 3 | 2020-01-04 | 52.765780 | 50.600264 | 28.892831 | 72.307696 | 57.836075 | 43.364453 | 65.071886 | 36.128642 | 0 | 0 | 0 | 0 | 0 | X-chart |
| 4 | 2020-01-05 | 54.500268 | 50.600264 | 28.892831 | 72.307696 | 57.836075 | 43.364453 | 65.071886 | 36.128642 | 0 | 0 | 0 | 0 | 0 | X-chart |
# Note that spc_data_Y will be None in SPC charts that only include a single chart (i.e. Individual-chart).
# In that case calling .head() on it would raise an AttributeError, so check for None first.
# Here the chart is an XmR chart, so the mR-chart data is available.
spc_data_Y.head()
| | ds | TARGET | r | cl | lcl | ucl | +1sd | -1sd | +2sd | -2sd | Rule 1 violation | Rule 2 violation | Rule 3 violation | chart type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-01 | 61.273469 | NaN | 8.161995 | 0 | 34.851717 | -18.527728 | 34.851717 | -45.217451 | 61.54144 | 0 | 0 | 0 | mR-chart |
| 1 | 2020-01-02 | 44.770683 | 16.502786 | 8.161995 | 0 | 34.851717 | -18.527728 | 34.851717 | -45.217451 | 61.54144 | 0 | 0 | 0 | mR-chart |
| 2 | 2020-01-03 | 39.007239 | 5.763444 | 8.161995 | 0 | 34.851717 | -18.527728 | 34.851717 | -45.217451 | 61.54144 | 0 | 0 | 0 | mR-chart |
| 3 | 2020-01-04 | 52.765780 | 13.758540 | 8.161995 | 0 | 34.851717 | -18.527728 | 34.851717 | -45.217451 | 61.54144 | 0 | 0 | 0 | mR-chart |
| 4 | 2020-01-05 | 54.500268 | 1.734488 | 8.161995 | 0 | 34.851717 | -18.527728 | 34.851717 | -45.217451 | 61.54144 | 0 | 0 | 0 | mR-chart |
If you know a process change is occurring and want to re-calculate the control lines to reflect the change, include the change_dates argument (must be a list of date(s)) in the initialisation, as shown below.
# XmR chart whose control lines are re-calculated at each known process change.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='XmR-chart',
    # Change dates must be passed as a list; these are the first days of the
    # second and third regimes in the dataset.
    change_dates=['2020-03-02', '2020-06-02'],
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (XmR-chart) - Re-calculating control lines')
Now, suppose we didn't know whether a change in the system had an impact on our measured process. Rather than using the change_dates parameter, we can use the baseline_date parameter to calculate control lines only on data before this specified date. This will give an indication of the impact on the measured process following a change to the system.
# Individual chart whose control lines are based only on a baseline period.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='Individual-chart',
    # Baseline date: 2020-03-01 is the last day of the first regime in the dataset.
    baseline_date='2020-03-01',
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (Individual-chart) - Setting baseline date')
Our examples have used daily data, but data of any frequency can be used.
"""
Artificial hourly data from a single stable process, sampled from a Poisson distribution.
"""
df = pd.DataFrame()
# 'h' is the current pandas alias for hourly frequency; the uppercase 'H'
# spelling is deprecated in recent pandas releases.
df['ds'] = pd.date_range(start='2020-01-01', end='2020-02-01', freq='h')
# Count data: every hour is drawn from the same Poisson(lam=10) distribution.
df['TARGET'] = np.random.poisson(lam=10, size=len(df))
df = df.set_index('ds', drop=True)
# Print first few rows
df.head()
| | TARGET |
|---|---|
| ds | |
| 2020-01-01 00:00:00 | 11 |
| 2020-01-01 01:00:00 | 14 |
| 2020-01-01 02:00:00 | 5 |
| 2020-01-01 03:00:00 | 15 |
| 2020-01-01 04:00:00 | 6 |
# c-chart on the hourly count data.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='c-chart',
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (c-chart) - Hourly data')
Some SPC charts take averages of samples of data, meaning each day (or hour, week, etc...), has n samples.
# Creating dataset: two observations per weekly date, with a shift in the mean
# between the first regime (df_1) and the second regime (df_2).
df_1 = pd.DataFrame()
df_2 = pd.DataFrame()
# 'W' is the documented pandas alias for weekly frequency. Repeating the list of
# dates twice gives every date two rows, i.e. a sample size of 2.
df_1['ds'] = list(pd.date_range(start='2020-01-01', end='2020-03-01', freq='W')) * 2
df_1['TARGET'] = np.random.normal(loc=40, scale=10, size=len(df_1))
df_2['ds'] = list(pd.date_range(start='2020-03-02', end='2020-09-01', freq='W')) * 2
df_2['TARGET'] = np.random.normal(loc=65, scale=10, size=len(df_2))
# Sort so the two rows of each date sit next to each other. No reset_index is
# needed because set_index below replaces the existing index anyway.
df = pd.concat([df_1, df_2], axis=0).sort_values(by='ds')
df = df.set_index('ds', drop=True)
df.head()
| | TARGET |
|---|---|
| ds | |
| 2020-01-05 | 40.241895 |
| 2020-01-05 | 45.335876 |
| 2020-01-12 | 24.715988 |
| 2020-01-12 | 23.153994 |
| 2020-01-19 | 31.363442 |
# X-bar R chart on the weekly data (two samples per date), re-calculating the
# control lines at the given change date.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='XbarR-chart',
    # NOTE(review): 2020-03-01 is the last sample date of the first regime; the
    # second regime's first sample date is 2020-03-08. Confirm which side of the
    # boundary change_dates is expected to name.
    change_dates=['2020-03-01'],
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (X bar R -chart) - Re-calculating control lines')
Duplicate dates detected. Constant sample size = 2
Generally, a sample size $\geq 5$ qualifies the use of the $\bar{X}S$-chart (standard deviation rather than range).
# Weekly data: the run of weekly dates is repeated 10 times, so every date
# carries 10 observations drawn from Normal(loc=50, scale=10).
weekly_dates = pd.date_range(start='2020-01-01', end='2020-06-01', freq='W')
df = pd.DataFrame({'ds': list(weekly_dates) * 10})
df = df.assign(
    TARGET=np.random.normal(loc=50, scale=10, size=len(df)),
).set_index('ds', drop=True)
# X-bar S chart on the weekly data with 10 samples per date.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='XbarS-chart',
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (X bar S -chart) - Basic usage')
Duplicate dates detected. Constant sample size = 10
With p-charts, u-charts and np-charts, you'll need to feed in an additional 'n' column, representing the sample size, in order to calculate proportions.
# Daily data for a p-chart: TARGET is a count and n is the (varying) sample size.
df = pd.DataFrame()
# 'D' is the documented pandas alias for calendar-day frequency; the date range
# can be assigned directly, no list() conversion is needed.
df['ds'] = pd.date_range(start='2020-01-01', end='2020-01-25', freq='D')
# randint's upper bound is exclusive: TARGET is in [5, 19] and n is in [50, 79],
# so TARGET is always smaller than n.
df['TARGET'] = np.random.randint(5, 20, len(df))
df['n'] = np.random.randint(50, 80, len(df))
df = df.set_index('ds', drop=True)
# p-chart: df carries the additional 'n' (sample size) column alongside TARGET.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='p-chart',
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (p-chart) - Basic usage')
# Daily data for an np-chart: TARGET is a count and n is a constant sample size.
df = pd.DataFrame()
# 'D' is the documented pandas alias for calendar-day frequency; the date range
# can be assigned directly, no list() conversion is needed.
df['ds'] = pd.date_range(start='2020-01-01', end='2020-01-25', freq='D')
# randint's upper bound is exclusive, so TARGET is in [1, 9].
df['TARGET'] = np.random.randint(1, 10, len(df))
# Same sample size for every day.
df['n'] = 50
df = df.set_index('ds', drop=True)
# np-chart on the dataset with a constant sample size 'n'.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='np-chart',
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (np-chart) - Basic usage')
# u-chart on the same dataframe (TARGET plus sample-size column 'n').
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='u-chart',
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (u-chart) - Basic usage')
# Creating another dataset: daily counts (TARGET) with varying sample sizes (n),
# where both change level from 2020-01-16 onwards.
df_1 = pd.DataFrame()
df_2 = pd.DataFrame()
# 'D' is the documented pandas alias for calendar-day frequency; the date range
# can be assigned directly, no list() conversion is needed.
df_1['ds'] = pd.date_range(start='2020-01-01', end='2020-01-15', freq='D')
df_2['ds'] = pd.date_range(start='2020-01-16', end='2020-01-30', freq='D')
# randint's upper bound is exclusive: TARGET is in [5, 19] then [25, 39].
df_1['TARGET'] = np.random.randint(5, 20, len(df_1))
df_2['TARGET'] = np.random.randint(25, 40, len(df_2))
# Sample sizes are in [50, 79] for the first regime and [40, 49] for the second.
df_1['n'] = np.random.randint(50, 80, len(df_1))
df_2['n'] = np.random.randint(40, 50, len(df_2))
# No reset_index is needed: set_index below replaces the existing index anyway.
df = pd.concat([df_1, df_2], axis=0)
df = df.set_index('ds', drop=True)
df.head()
| | TARGET | n |
|---|---|---|
| ds | | |
| 2020-01-01 | 13 | 70 |
| 2020-01-02 | 11 | 55 |
| 2020-01-03 | 13 | 73 |
| 2020-01-04 | 12 | 73 |
| 2020-01-05 | 19 | 75 |
# u-chart with control limits re-calculated at the given change date.
spc_example = SPC(
    data_in=df,
    target_col='TARGET',
    chart_type='u-chart',
    # NOTE(review): 2020-01-15 is the last day of the first regime in df; the
    # second regime starts on 2020-01-16. Confirm which side of the boundary
    # change_dates is expected to name.
    change_dates=['2020-01-15'],
)

# Calculate the control lines, test the SPC rules, then draw the chart.
spc_example.setup()
spc_example.check_rules()
spc_example.plot_spc(title='Example SPC (u-chart) - Recalculating control limits after process change.')